Model Selection
Data Generation
#' Draw a random sample from one of twelve distribution families.
#'
#' @param family Name of the distribution family; one of the cases below.
#' @param size Number of observations to draw.
#' @param noise Kept for backward compatibility; currently unused.
#' @param location First parameter of the family (mean, shape, df, ... —
#'   reinterpreted per family).
#' @param scale Second parameter of the family (sd, rate, shape2, ... —
#'   reinterpreted per family).
#' @return A numeric vector of length `size`.
#'
#' Unknown families now raise an explicit error; the original if/else chain
#' fell through and failed later with "object 'x' not found".
generate_distribution <- function(family = "normal", size = 1000, noise = 0, location = 0, scale = 1) {
  switch(family,
    "normal" = rnorm(size, location, scale),
    "beta" = rbeta(size, location, scale),
    # rbinom's `size` (trials) must be >= 1 and `prob` in [0, 1], hence
    # the +1 and the power-of-ten rescaling of `scale`
    "binomial" = rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale)))),
    # NOTE(review): 3rd argument of rchisq is ncp (non-centrality), not a
    # scale — confirm this is intended
    "chi" = rchisq(size, location, scale),
    "exponential" = rexp(size, scale),
    "F" = rf(size, location, scale + 0.1), # +0.1 keeps df2 strictly positive
    "gamma" = rgamma(size, location, scale),
    "lognormal" = rlnorm(size, location, scale),
    "poisson" = rpois(size, location),
    # NOTE(review): 3rd argument of rt is ncp, not a scale — confirm intended
    "t" = rt(size, location, scale),
    "uniform" = runif(size, location, location * 2),
    "weibull" = rweibull(size, location, scale),
    stop("Unknown distribution family: ", family, call. = FALSE)
  )
}
# Simulate 10 random samples per distribution family and extract summary
# features from each. Rows are accumulated in a list and bound once at the
# end: growing a data.frame with rbind() inside the loop is quadratic in
# the number of rows.
rows <- list()
for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F",
                       "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) {
  for (i in 1:10) {
    # Random sample size and parameters so the features must generalize
    # across scales rather than memorize a fixed configuration
    size <- round(runif(1, 10, 10000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)
    x <- generate_distribution(distribution, size = size, location = location, scale = scale)
    # Normalized 20-point density estimate, used both directly as features
    # and as input to the smoothness metrics
    density_Z <- parameters::normalize(density(x, n = 20)$y)
    # Extract features
    data <- data.frame(
      "Mean" = mean(x),
      "SD" = sd(x),
      "Median" = median(x),
      "MAD" = mad(x, constant = 1),
      "Mean_Median_Distance" = mean(x) - median(x),
      "Mean_Mode_Distance" = mean(x) - bayestestR::map_estimate(x),
      "SD_MAD_Distance" = sd(x) - mad(x, constant = 1),
      "Mode" = bayestestR::map_estimate(x),
      "Range" = diff(range(x)) / sd(x), # range expressed in SD units
      "IQR" = stats::IQR(x),
      "Skewness" = skewness(x),
      "Kurtosis" = kurtosis(x),
      "Smoothness_Cor" = parameters::smoothness(density(x)$y, method = "cor"),
      "Smoothness_Diff" = parameters::smoothness(density(x)$y, method = "diff"),
      "Smoothness_Z_Cor_1" = parameters::smoothness(density_Z, method = "cor", lag = 1),
      "Smoothness_Z_Diff_1" = parameters::smoothness(density_Z, method = "diff", lag = 1),
      "Smoothness_Z_Cor_3" = parameters::smoothness(density_Z, method = "cor", lag = 3),
      "Smoothness_Z_Diff_3" = parameters::smoothness(density_Z, method = "diff", lag = 3)
    )
    # Append the normalized density values themselves as Density_1..Density_20
    density_df <- as.data.frame(t(density_Z))
    names(density_df) <- paste0("Density_", 1:ncol(density_df))
    data <- cbind(data, density_df)
    data$Distribution <- distribution
    rows[[length(rows) + 1]] <- data
  }
}
df <- do.call(rbind, rows)
Preparation
# Data cleaning: drop rows with missing or infinite feature values
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]
# Data partitioning: stratified 80/20 train/test split on the class label
trainIndex <- caret::createDataPartition(df$Distribution, p = 0.8, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
# Parameters: 5-fold CV repeated ten times
fitControl <- caret::trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 10,
  classProbs = TRUE,
  returnData = FALSE,
  trim = TRUE,
  allowParallel = TRUE
)
# Set up parallel backend. detectCores() is documented to possibly return
# NA, and on a single-core machine detectCores() - 1 would be 0 (invalid),
# so clamp to at least one worker; convention is to leave 1 core for the OS.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cluster)
Training
# Training: fit all candidate classifiers on the same training data with the
# same resampling scheme so their resample results are directly comparable.
fit_candidate <- function(method, ...) {
  caret::train(Distribution ~ ., data = train,
               method = method,
               trControl = fitControl,
               ...)
}
model_rf <- fit_candidate("rf")
model_nb <- fit_candidate("naive_bayes")
model_gbm <- fit_candidate("gbm")
model_nnet <- fit_candidate("nnet", linout = 0)
stopCluster(cluster) # explicitly shut down the cluster
Model Comparison
# Collect the cross-validation resamples of every candidate model so their
# accuracy/kappa distributions can be compared on identical folds
results <- resamples(list(
  "RandomForest" = model_rf,
  "NaiveBayes" = model_nb,
  "GBM" = model_gbm,
  "Nnet" = model_nnet
))
# Summarize the per-model performance distributions
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: RandomForest, NaiveBayes, GBM, Nnet
## Number of resamples: 50
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## RandomForest 0.65000000 0.7739899 0.8248082 0.8176960 0.8553571 0.9500000
## NaiveBayes 0.38888889 0.6190476 0.6842105 0.6883850 0.7619048 0.9000000
## GBM 0.65000000 0.7727273 0.8090909 0.8118162 0.8620130 0.9473684
## Nnet 0.09090909 0.1625387 0.2222222 0.2160156 0.2631579 0.3529412
## NA's
## RandomForest 0
## NaiveBayes 0
## GBM 0
## Nnet 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## RandomForest 0.61748634 0.7523315 0.8076304 0.7997679 0.8420609 0.9452055
## NaiveBayes 0.33557047 0.5833731 0.6545423 0.6584340 0.7386431 0.8901099
## GBM 0.61643836 0.7512717 0.7910882 0.7933400 0.8493344 0.9420732
## Nnet 0.02173913 0.1019144 0.1669399 0.1574988 0.2130178 0.2889734
## NA's
## RandomForest 0
## NaiveBayes 0
## GBM 0
## Nnet 0
# dot plots of the resampled metrics for a visual side-by-side comparison
dotplot(results)

# Sizes: in-memory footprint of each fitted model, in kilobytes
size_kb <- function(m) {
  as.numeric(object.size(m)) / 1000
}
data.frame("RandomForest" = size_kb(model_rf),
           "NaiveBayes" = size_kb(model_nb),
           "GBM" = size_kb(model_gbm),
           "Nnet" = size_kb(model_nnet))
## RandomForest NaiveBayes GBM Nnet
## 1 1153.152 445.848 3427.632 863.112
Best Model Inspection
# Random forest had the best resampled accuracy above, so inspect it further
model <- model_rf
# Performance on the held-out test set (precision/recall mode)
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))
| Accuracy |
0.7916667 |
| Kappa |
0.7727273 |
| AccuracyLower |
0.5784872 |
| AccuracyUpper |
0.9286814 |
| AccuracyNull |
0.0833333 |
| AccuracyPValue |
0.0000000 |
| McnemarPValue |
NaN |
# Prediction Table
knitr::kable(confusion$table)
| beta |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| binomial |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| chi |
0 |
0 |
2 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
| exponential |
1 |
0 |
0 |
2 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| F |
0 |
0 |
0 |
0 |
2 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| gamma |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| lognormal |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
0 |
0 |
0 |
0 |
0 |
| normal |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
0 |
0 |
0 |
0 |
| poisson |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
0 |
0 |
0 |
| t |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| uniform |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
0 |
| weibull |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
# Prediction Figure: per-class sensitivity/specificity from the confusion matrix
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
# Row names look like "Class: normal"; strip the prefix to get the class label
perf$Distribution <- gsub("Class: ", "", row.names(perf))
# Wide -> long: one row per (Distribution, metric); reshape() codes the
# metric as 1/2 in `Type`, recoded to readable labels just below
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x=Distribution, y=Metric, fill=Type)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
geom_hline(aes(yintercept=0.5), linetype="dotted") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Features: variable importance of the selected model (scaled to 0-100)
features <- caret::varImp(model, scale = TRUE)
plot(features)

Random Forest Training with Feature Selection
Data Generation
# Simulate 10000 samples per family with the reduced feature set selected
# from the first experiment. With 120000 total rows, growing `df` via
# rbind() inside the loop is quadratic and extremely slow; accumulate rows
# in a list and bind once at the end instead.
rows <- list()
for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F",
                       "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) {
  for (i in 1:10000) {
    # Random sample size and parameters so the features must generalize
    size <- round(runif(1, 10, 10000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)
    x <- generate_distribution(distribution, size = size, location = location, scale = scale)
    # Normalized 100-point density estimate feeding the smoothness metrics
    density_Z <- parameters::normalize(density(x, n = 100)$y)
    # Extract features
    data <- data.frame(
      "Mean" = mean(x),
      "SD" = sd(x),
      "Median" = median(x),
      "MAD" = mad(x, constant = 1),
      "Mean_Median_Distance" = mean(x) - median(x),
      "Mean_Mode_Distance" = mean(x) - bayestestR::map_estimate(x),
      "SD_MAD_Distance" = sd(x) - mad(x, constant = 1),
      "Mode" = bayestestR::map_estimate(x),
      "Range" = diff(range(x)) / sd(x), # range expressed in SD units
      "IQR" = stats::IQR(x),
      "Skewness" = skewness(x),
      "Kurtosis" = kurtosis(x),
      "Smoothness_Cor_1" = parameters::smoothness(density_Z, method = "cor", lag = 1),
      "Smoothness_Diff_1" = parameters::smoothness(density_Z, method = "diff", lag = 1),
      "Smoothness_Cor_5" = parameters::smoothness(density_Z, method = "cor", lag = 5),
      "Smoothness_Diff_5" = parameters::smoothness(density_Z, method = "diff", lag = 5)
    )
    data$Distribution <- distribution
    rows[[length(rows) + 1]] <- data
  }
}
df <- do.call(rbind, rows)
Preparation
# Data cleaning: drop rows with missing or infinite feature values
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]
# Data partitioning: stratified 80/20 train/test split on the class label
trainIndex <- caret::createDataPartition(df$Distribution, p = 0.8, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
# Parameters: 5-fold CV repeated ten times
fitControl <- caret::trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 10,
  classProbs = TRUE,
  returnData = FALSE,
  trim = TRUE,
  allowParallel = TRUE
)
# Set up parallel backend. detectCores() is documented to possibly return
# NA, and on a single-core machine detectCores() - 1 would be 0 (invalid),
# so clamp to at least one worker; convention is to leave 1 core for the OS.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cluster)
Training
# Training: fit only the winning algorithm (random forest) on the larger,
# feature-selected dataset
model <- caret::train(Distribution ~ ., data = train,
method = "rf",
trControl = fitControl)
stopCluster(cluster) # explicitly shut down the cluster
Model Inspection
# Performance on the held-out test set (precision/recall mode)
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))
| Accuracy |
0.9768654 |
| Kappa |
0.9747622 |
| AccuracyLower |
0.9748834 |
| AccuracyUpper |
0.9787306 |
| AccuracyNull |
0.0833681 |
| AccuracyPValue |
0.0000000 |
| McnemarPValue |
NaN |
# Prediction Table
knitr::kable(confusion$table)
| beta |
1987 |
0 |
0 |
5 |
0 |
19 |
0 |
2 |
0 |
0 |
0 |
16 |
| binomial |
1 |
1955 |
0 |
0 |
1 |
0 |
0 |
0 |
49 |
0 |
1 |
1 |
| chi |
0 |
0 |
1977 |
3 |
0 |
40 |
1 |
1 |
0 |
3 |
0 |
49 |
| exponential |
4 |
0 |
5 |
1985 |
0 |
46 |
0 |
0 |
0 |
0 |
0 |
24 |
| F |
0 |
0 |
2 |
4 |
1985 |
1 |
6 |
0 |
0 |
3 |
0 |
15 |
| gamma |
4 |
1 |
11 |
2 |
2 |
1871 |
20 |
0 |
1 |
6 |
0 |
27 |
| lognormal |
0 |
0 |
0 |
0 |
7 |
2 |
1953 |
1 |
0 |
10 |
0 |
4 |
| normal |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1967 |
2 |
2 |
0 |
11 |
| poisson |
0 |
39 |
0 |
0 |
1 |
0 |
0 |
0 |
1942 |
2 |
0 |
3 |
| t |
0 |
0 |
1 |
0 |
3 |
4 |
9 |
0 |
2 |
1967 |
0 |
0 |
| uniform |
3 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
1998 |
1 |
| weibull |
1 |
3 |
4 |
1 |
0 |
17 |
11 |
29 |
2 |
0 |
1 |
1848 |
# Prediction Figure: per-class sensitivity/specificity from the confusion matrix
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
# Row names look like "Class: normal"; strip the prefix to get the class label
perf$Distribution <- gsub("Class: ", "", row.names(perf))
# Wide -> long: one row per (Distribution, metric); reshape() codes the
# metric as 1/2 in `Type`, recoded to readable labels just below
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x=Distribution, y=Metric, fill=Type)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
geom_hline(aes(yintercept=0.5), linetype="dotted") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Features: variable importance of the final model (scaled to 0-100)
features <- caret::varImp(model, scale = TRUE)
plot(features)

Reduce Size and Save
# Reduce size: strip components that are only needed for training
# diagnostics, not for predict(), before serializing the model
for (part in c("predicted", "oob.times", "y", "votes")) {
  model$finalModel[[part]] <- NULL
}
model$control$indexOut <- NULL
model$control$index <- NULL
model$trainingData <- NULL
# Drop captured environments, which would otherwise drag the whole
# training workspace into the saved file
attr(model$terms, ".Environment") <- NULL
attr(model$formula, ".Environment") <- NULL
# Test: the stripped model must still produce factor predictions
is.factor(predict(model, df))
## [1] TRUE
# Save under its export name
find_distribution <- model
save(find_distribution, file="find_distribution.rda")